From Command Line - Import CSV file (Raw Data) into MongoDB

mongoimport --db airbnb --type csv --file listings_new.csv -c listings_new

mongoimport --db airbnb --type csv --file barcelona_attractions.csv -c attractions


In [1]:
import pymongo
from pymongo import MongoClient

Connect Python to MongoDB


In [2]:
client = MongoClient('mongodb://localhost:27017/')

Retrieve from Database

Database named as "airbnb"


In [3]:
db = client.airbnb

Retrieve Tables from Database


In [36]:
listings = db.listings_new
attractions = db.attractions

Store data in a pandas dataframe for further analysis


In [5]:
import pandas as pd

In [6]:
listings_df = pd.DataFrame(list(db.listings_new.find()))

In [7]:
listings_df.head()


Out[7]:
_id access accommodates amenities availability_30 availability_365 availability_60 availability_90 bathrooms bed_type ... space square_feet state street summary thumbnail_url transit weekly_price xl_picture_url zipcode
0 58f75b849ebfacefa7df7620 Los huéspedes podrán utilizar las zonas comune... 2 {TV,Internet,"Wireless Internet",Kitchen,Break... 29 364 59 89 1 Real Bed ... En pleno corazón de Barcelona, frente a rambla... Catalunya Carrer de Sant Josep Oriol, Barcelona, Catalun... Mi casa esta ubicada en el corazón de Barcelon... https://a2.muscache.com/im/pictures/34156f3a-4... Todo tipo de transporte: metro, bus, tren, bic... https://a2.muscache.com/im/pictures/34156f3a-4... 8001
1 58f75b849ebfacefa7df7621 We share common areas of apartment: bathroom, ... 2 {TV,"Cable TV",Internet,"Wireless Internet","A... 26 86 56 86 1 Real Bed ... Let me introduce you my very nice apartment ... Catalonia Carrer del Peu de la Creu, Barcelona, Cataloni... The flat is located in Sant Antoni (between Ra... https://a2.muscache.com/im/pictures/826cf18f-6... There is a bike-rental around the corner for d... https://a2.muscache.com/im/pictures/826cf18f-6... 8001
2 58f75b849ebfacefa7df7622 2 {Internet,"Wireless Internet","Wheelchair acce... 28 88 58 88 1 Real Bed ... CHECK-IN is before 23:00,CHECK-OUT is at 12:00... Catalunya Carrer del Cid, Barcelona, Catalunya 08001, Spain Located in the Old City, near the famous RAMBL... https://a2.muscache.com/im/pictures/f89056a4-f... Very suitable for tourist visits,example dista... https://a2.muscache.com/im/pictures/f89056a4-f... 8001
3 58f75b849ebfacefa7df7623 You will have to share the living room and the... 2 {Internet,"Wireless Internet",Kitchen,"Smoking... 0 158 0 0 1 Real Bed ... Rambla del Raval, Barcelona, 08001, Spain Double bedroom in the earth of the city, in th... https://a2.muscache.com/im/pictures/d07ab326-0... The flat is near the buses going in few minute... https://a2.muscache.com/im/pictures/d07ab326-0... 8001
4 58f75b849ebfacefa7df7624 4 {"Wireless Internet",Kitchen,"Smoking allowed"... 5 329 28 58 2 Real Bed ... Welcome to the Piero's home (Italian)! The h... 172 Cataluña, Catalonia Carrer de la Junta de Comerç, Barcelona, Catal... https://a0.muscache.com/im/pictures/10003580/b... https://a0.muscache.com/im/pictures/10003580/b... 8001

5 rows × 96 columns


In [8]:
listings_df.columns.values


Out[8]:
array(['_id', 'access', 'accommodates', 'amenities', 'availability_30',
       'availability_365', 'availability_60', 'availability_90',
       'bathrooms', 'bed_type', 'bedrooms', 'beds',
       'calculated_host_listings_count', 'calendar_last_scraped',
       'calendar_updated', 'cancellation_policy', 'city', 'cleaning_fee',
       'country', 'country_code', 'description', 'experiences_offered',
       'extra_people', 'first_review', 'guests_included',
       'has_availability', 'host_about', 'host_acceptance_rate',
       'host_has_profile_pic', 'host_id', 'host_identity_verified',
       'host_is_superhost', 'host_listings_count', 'host_location',
       'host_name', 'host_neighbourhood', 'host_picture_url',
       'host_response_rate', 'host_response_time', 'host_since',
       'host_thumbnail_url', 'host_total_listings_count', 'host_url',
       'host_verifications', 'house_rules', 'id', 'instant_bookable',
       'interaction', 'is_location_exact', 'jurisdiction_names',
       'last_review', 'last_scraped', 'latitude', 'license', 'listing_url',
       'longitude', 'market', 'maximum_nights', 'medium_url',
       'minimum_nights', 'monthly_price', 'name', 'neighborhood_overview',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'notes', 'number_of_reviews',
       'picture_url', 'price', 'property_type',
       'require_guest_phone_verification', 'require_guest_profile_picture',
       'requires_license', 'review_scores_accuracy',
       'review_scores_checkin', 'review_scores_cleanliness',
       'review_scores_communication', 'review_scores_location',
       'review_scores_rating', 'review_scores_value', 'reviews_per_month',
       'room_type', 'scrape_id', 'security_deposit', 'smart_location',
       'space', 'square_feet', 'state', 'street', 'summary',
       'thumbnail_url', 'transit', 'weekly_price', 'xl_picture_url',
       'zipcode'], dtype=object)

Convert numeric variables


In [9]:
listings_df = listings_df.convert_objects(convert_numeric=True)


C:\Users\Yen Leng\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  if __name__ == '__main__':

In [10]:
listings_df['price'] = listings_df['price'].str[1:]
listings_df['price'] = listings_df.price.replace(',', '',regex=True)
listings_df['price'] = listings_df.price.astype(float).fillna(0.0)

In [11]:
listings_df['extra_people'] = listings_df['extra_people'].str[1:]
listings_df['extra_people'] = listings_df.extra_people.replace(',', '',regex=True).replace('', '0',regex=True)
listings_df['extra_people'] = listings_df.extra_people.astype(float).fillna(0.0)

In [12]:
listings_df['weekly_price'] = listings_df['weekly_price'].str[1:]
listings_df['weekly_price'] = listings_df.weekly_price.replace(',', '',regex=True).replace('', '0',regex=True)
listings_df['weekly_price'] = listings_df.weekly_price.astype(float).fillna(0.0)

In [13]:
listings_df['monthly_price'] = listings_df['monthly_price'].str[1:]
listings_df['monthly_price'] = listings_df.monthly_price.replace(',', '',regex=True).replace('', '0',regex=True)
listings_df['monthly_price'] = listings_df.monthly_price.astype(float).fillna(0.0)

In [14]:
listings_df['security_deposit'] = listings_df['security_deposit'].str[1:]
listings_df['security_deposit'] = listings_df.security_deposit.replace(',', '',regex=True).replace('', '0',regex=True)
listings_df['security_deposit'] = listings_df.security_deposit.astype(float).fillna(0.0)

In [15]:
listings_df['cleaning_fee'] = listings_df['cleaning_fee'].str[1:]
listings_df['cleaning_fee'] = listings_df.cleaning_fee.replace(',', '',regex=True).replace('', '0',regex=True)
listings_df['cleaning_fee'] = listings_df.cleaning_fee.astype(float).fillna(0.0)

Convert Amenities to Dummy Variables


In [17]:
listings_df['amenities_split'] = listings_df["amenities"].apply(lambda x: x[1:-1].split(','))

In [30]:
#Get unique amenities
unique_amenities = list(set(x for l in listings_df["amenities_split"] for x in l))
unique_amenities = unique_amenities[0:2] + unique_amenities[3:]
unique_amenities


Out[30]:
['"Elevator in building"',
 '"translation missing: en.hosting_amenity_50"',
 'Gym',
 '"Washer / Dryer"',
 '"Buzzer/wireless intercom"',
 '"Other pet(s)"',
 '"Fire extinguisher"',
 'Washer',
 '"Wheelchair accessible"',
 '"Hot tub"',
 'Hangers',
 'Doorman',
 '"Smoke detector"',
 'Cat(s)',
 '"Laptop friendly workspace"',
 'Pool',
 '"Lock on bedroom door"',
 '"Free parking on street"',
 '"Pets allowed"',
 '"Safety card"',
 'Dog(s)',
 '"Carbon monoxide detector"',
 'Internet',
 '"Air conditioning"',
 'Essentials',
 'Kitchen',
 'Shampoo',
 '"First aid kit"',
 'TV',
 '"Wireless Internet"',
 'Dryer',
 'Breakfast',
 '"translation missing: en.hosting_amenity_49"',
 '"Pets live on this property"',
 '"Private entrance"',
 '"Cable TV"',
 '"Paid parking off premises"',
 '"24-hour check-in"',
 'Iron',
 '"Family/kid friendly"',
 '"Suitable for events"',
 '"Indoor fireplace"',
 '"Free parking on premises"',
 'Heating',
 '"Hair dryer"',
 '"Smoking allowed"',
 '"Private living room"']

In [32]:
num_col = len(unique_amenities) #number of columns
data_array = []
for n in range(0, len(listings_df)):
    lst = []
    for i in range (0, num_col):
        row = listings_df["amenities_split"][n]
        if unique_amenities[i] in row:
            lst.append(1)
        else:
            lst.append(0)
    data_array.append(lst)

df = pd.DataFrame(data_array, columns=unique_amenities)

In [41]:
listings_df2 = listings_df.join(df)

In [42]:
listings_df2.head()


Out[42]:
_id access accommodates amenities availability_30 availability_365 availability_60 availability_90 bathrooms bed_type ... "24-hour check-in" Iron "Family/kid friendly" "Suitable for events" "Indoor fireplace" "Free parking on premises" Heating "Hair dryer" "Smoking allowed" "Private living room"
0 58f75b849ebfacefa7df7620 NaN 2 {TV,Internet,"Wireless Internet",Kitchen,Break... 29 364 59 89 1.0 Real Bed ... 1 1 0 0 0 0 1 1 0 0
1 58f75b849ebfacefa7df7621 NaN 2 {TV,"Cable TV",Internet,"Wireless Internet","A... 26 86 56 86 1.0 Real Bed ... 0 1 0 0 0 0 1 1 0 0
2 58f75b849ebfacefa7df7622 NaN 2 {Internet,"Wireless Internet","Wheelchair acce... 28 88 58 88 1.0 Real Bed ... 0 0 1 0 0 0 1 1 0 0
3 58f75b849ebfacefa7df7623 NaN 2 {Internet,"Wireless Internet",Kitchen,"Smoking... 0 158 0 0 1.0 Real Bed ... 0 0 0 0 0 0 0 0 1 0
4 58f75b849ebfacefa7df7624 NaN 4 {"Wireless Internet",Kitchen,"Smoking allowed"... 5 329 28 58 2.0 Real Bed ... 0 0 0 0 0 0 0 0 1 0

5 rows × 144 columns

Combine Attractions Data


In [37]:
attractions = pd.DataFrame(list(db.attractions.find()))

In [38]:
attractions.head()


Out[38]:
_id attraction lat long rating
0 58f7630c9ebfacefa7dfbb00 Basilica of the Sagrada Familia 41.403591 2.174360 4
1 58f7630c9ebfacefa7dfbb01 Casa Batllo 41.391712 2.164824 4
2 58f7630c9ebfacefa7dfbb02 Palau de la Musica Orfeo Catala 41.387573 2.175302 4
3 58f7630c9ebfacefa7dfbb03 Gothic Quarter (Barri Gotic) 41.382702 2.177113 4
4 58f7630c9ebfacefa7dfbb04 Camp Nou 41.380978 2.122884 4

In [39]:
#Calculate distance between 2 lat long points
#Returns distance in km
def distance(lat1, long1, lat2, long2):
    from math import sin, cos, sqrt, atan2, radians

    # approximate radius of earth in km
    R = 6373.0

    lat1 = radians(lat1)
    long1 = radians(long1)
    lat2 = radians(lat2)
    long2 = radians(long2)

    dlong = long2 - long1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlong / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c

    return distance

In [43]:
for n in range(0, len(listings_df2)):
    nearest_attr = attractions['attraction'][0]
    nearest_attr_rating = attractions['rating'][0]
    nearest_attr_lat = attractions['lat'][0]
    nearest_attr_long = attractions['long'][0]

    list_lat = listings_df2['latitude'][n]
    list_long = listings_df2['longitude'][n]

    #Distance from first attraction to listing
    dist_nearest = distance(list_lat, list_long, nearest_attr_lat, nearest_attr_long)

    for i in range(1, len(attractions)):
        attr_lat = attractions['lat'][i]
        attr_long = attractions['long'][i]
        dist = distance(list_lat, list_long, attr_lat, attr_long)

        if dist < dist_nearest:
            nearest_attr = attractions['attraction'][i]
            nearest_attr_rating = attractions['rating'][i]
            nearest_attr_lat = attractions['lat'][i]
            nearest_attr_long = attractions['long'][i]
            dist_nearest = dist  

    listings_df2.loc[n, 'nearest_attr'] = nearest_attr
    listings_df2.loc[n, 'nearest_attr_rating'] = nearest_attr_rating
    listings_df2.loc[n, 'nearest_attr_lat'] = nearest_attr_lat
    listings_df2.loc[n, 'nearest_attr_long'] = nearest_attr_long
    listings_df2.loc[n, 'nearest_attr_dist'] = dist_nearest

In [44]:
listings_df2.head()


Out[44]:
_id access accommodates amenities availability_30 availability_365 availability_60 availability_90 bathrooms bed_type ... "Free parking on premises" Heating "Hair dryer" "Smoking allowed" "Private living room" nearest_attr nearest_attr_rating nearest_attr_lat nearest_attr_long nearest_attr_dist
0 58f75b849ebfacefa7df7620 NaN 2 {TV,Internet,"Wireless Internet",Kitchen,Break... 29 364 59 89 1.0 Real Bed ... 0 1 1 0 0 Cat, Fernando Botero 4.0 41.378109 2.169838 0.116424
1 58f75b849ebfacefa7df7621 NaN 2 {TV,"Cable TV",Internet,"Wireless Internet","A... 26 86 56 86 1.0 Real Bed ... 0 1 1 0 0 Articket Barcelona 4.0 41.382950 2.167290 0.075337
2 58f75b849ebfacefa7df7622 NaN 2 {Internet,"Wireless Internet","Wheelchair acce... 28 88 58 88 1.0 Real Bed ... 0 1 1 0 0 Arts Santa Monica 4.0 41.377441 2.175973 0.129260
3 58f75b849ebfacefa7df7623 NaN 2 {Internet,"Wireless Internet",Kitchen,"Smoking... 0 158 0 0 1.0 Real Bed ... 0 0 0 1 0 Cat, Fernando Botero 4.0 41.378109 2.169838 0.057454
4 58f75b849ebfacefa7df7624 NaN 4 {"Wireless Internet",Kitchen,"Smoking allowed"... 5 329 28 58 2.0 Real Bed ... 0 0 0 1 0 Teatro Romea 5.0 41.380562 2.171060 0.049501

5 rows × 149 columns


In [45]:
#listings_df2.to_csv("listings_31Mar.csv")

In [ ]: